rm(list=ls())
library(pheatmap)
library(ggplot2)
library(dichromat)
library(dplyr)
library(ggrepel)
library(reshape2)
library(umap)
library(ggthemes)
library(cowplot)
library(DEP)
library(readr)
library(naniar)
library(SummarizedExperiment)
library(data.table)
library(readxl)
library(writexl)
library(stringr)
library(vsn)
library(rmarkdown)
library(tidyr)
library(Polychrome)
library(RColorBrewer)
# install.packages("devtools")
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# options(repos = c(
# CRAN = "https://cloud.r-project.org/",
# BiocManager::repositories()))
# ## Install dependencies and package
# devtools::install_github(
# "BioinfOMICS/LipidSigR",
# build_vignettes = TRUE, dependencies = TRUE)
# BiocManager::install(
# c('fgsea', 'gatom', 'mixOmics', 'S4Vectors', 'BiocGenerics',
# 'SummarizedExperiment', 'rgoslin'))
# install.packages(
# c('devtools', 'magrittr', 'plotly', 'tidyverse', 'factoextra', 'ggthemes',
# 'ggforce', 'Hmisc', 'hwordcloud', 'heatmaply', 'iheatmapr', 'Rtsne', 'uwot',
# 'wordcloud', 'rsample', 'ranger', 'caret', 'yardstick', 'fastshap',
# 'SHAPforxgboost', 'visNetwork', 'tidygraph', 'ggraph'))
# devtools::install_github("ctlab/mwcsr")
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
# create directory for results
dir.create(file.path(getwd(),'results'), showWarnings = FALSE)
# create directory for plots
dir.create(file.path(getwd(),'plots'), showWarnings = FALSE)
Loading data including:
The first step is to merge the information of the “All templates” and “Shipment of all boxes” files, and create 3 datasets (one for plasma, one for CSF, one for urine) with the information of Patient ID, Position, Biofluid, Visit, Tube number, Country, Rack and Box.
## sample info - CSF
paged_table(lipidomics_CSF_all)
dim(lipidomics_CSF_all)
## [1] 415 8
lipidomics_CSF_all_summary <- lipidomics_CSF_all %>%
group_by(Visit) %>%
summarise(
total_patients = n_distinct(Patient, na.rm = TRUE),
total_tubes = n_distinct(Tube_number, na.rm = TRUE),
total_patients_tubes_pairs = n_distinct(
paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)]))
lipidomics_CSF_all_summary
# which tube IDs are missing for Patient IDs at V0
lipidomics_CSF_all_V0 = lipidomics_CSF_all %>%
filter(Visit == "V0")
paged_table(lipidomics_CSF_all_V0 %>%
filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V0
paged_table(lipidomics_CSF_all_V0 %>%
filter(Patient %in% lipidomics_CSF_all_V0[duplicated(lipidomics_CSF_all_V0$Patient) & !is.na(lipidomics_CSF_all_V0$Patient),]$Patient))
# which tube IDs are missing for Patient IDs at V1
lipidomics_CSF_all_V1 = lipidomics_CSF_all %>%
filter(Visit == "V1")
paged_table(lipidomics_CSF_all_V1 %>%
filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V1
paged_table(lipidomics_CSF_all_V1 %>%
filter(Patient %in% lipidomics_CSF_all_V1[duplicated(lipidomics_CSF_all_V1$Patient) & !is.na(lipidomics_CSF_all_V1$Patient),]$Patient))
## sample info - plasma
paged_table(lipidomics_plasma_all)
dim(lipidomics_plasma_all)
## [1] 415 8
lipidomics_plasma_all_summary <- lipidomics_plasma_all %>%
group_by(Visit) %>%
summarise(
total_patients = n_distinct(Patient, na.rm = TRUE),
total_tubes = n_distinct(Tube_number, na.rm = TRUE),
total_patients_tubes_pairs = n_distinct(
paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)]))
lipidomics_plasma_all_summary
# which tube IDs are missing for Patient IDs at V0
lipidomics_plasma_all_V0 = lipidomics_plasma_all %>%
filter(Visit == "V0")
paged_table(lipidomics_plasma_all_V0 %>%
filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V0
paged_table(lipidomics_plasma_all_V0 %>%
filter(Patient %in% lipidomics_plasma_all_V0[duplicated(lipidomics_plasma_all_V0$Patient) & !is.na(lipidomics_plasma_all_V0$Patient),]$Patient))
# which tube IDs are missing for Patient IDs at V1
lipidomics_plasma_all_V1 = lipidomics_plasma_all %>%
filter(Visit == "V1")
paged_table(lipidomics_plasma_all_V1 %>%
filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V1
paged_table(lipidomics_plasma_all_V1 %>%
filter(Patient %in% lipidomics_plasma_all_V1[duplicated(lipidomics_plasma_all_V1$Patient) & !is.na(lipidomics_plasma_all_V1$Patient),]$Patient))
## sample info - urine
paged_table(lipidomics_urine_all)
dim(lipidomics_urine_all)
## [1] 384 8
lipidomics_urine_all_summary <- lipidomics_urine_all %>%
group_by(Visit) %>%
summarise(
total_patients = n_distinct(Patient, na.rm = TRUE),
total_tubes = n_distinct(Tube_number, na.rm = TRUE),
total_patients_tubes_pairs = n_distinct(
paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)]))
lipidomics_urine_all_summary
# which tube IDs are missing for Patient IDs at V0
lipidomics_urine_all_V0 = lipidomics_urine_all %>%
filter(Visit == "V0")
paged_table(lipidomics_urine_all_V0 %>%
filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V0
paged_table(lipidomics_urine_all_V0 %>%
filter(Patient %in% lipidomics_urine_all_V0[duplicated(lipidomics_urine_all_V0$Patient) & !is.na(lipidomics_urine_all_V0$Patient),]$Patient))
# which tube IDs are missing for Patient IDs at V1
lipidomics_urine_all_V1 = lipidomics_urine_all %>%
filter(Visit == "V1")
paged_table(lipidomics_urine_all_V1 %>%
filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V1
paged_table(lipidomics_urine_all_V1 %>%
filter(Patient %in% lipidomics_urine_all_V1[duplicated(lipidomics_urine_all_V1$Patient) & !is.na(lipidomics_urine_all_V1$Patient),]$Patient))
The second step is to aggregate all the raw data files together and save the batch information, in order to check later if there are any batch effects in the data.
paged_table(raw_data_CSF_final)
dim(raw_data_CSF_final)
## [1] 174 894
# how many rows of QC
length(grep("QC",raw_data_CSF_final$Tube_number))
## [1] 30
paged_table(raw_data_plasma_final)
dim(raw_data_plasma_final)
## [1] 382 894
# how many rows of QC
length(grep("QC",raw_data_plasma_final$Tube_number))
## [1] 66
The third step is to combine the raw data files with the patient information, and check the amount of samples per visit for each fluid.
raw_data_CSF_IDs = raw_data_CSF_final %>%
select(Tube_number,Batch)
raw_data_CSF_IDs <- raw_data_CSF_IDs %>%
mutate(Tube_number4 = str_sub(Tube_number,-4)) %>% # last 4 digits
filter(!str_detect(Tube_number, "QC")) # remove 30 QC IDs
# how many IDs in the raw data CSF
length(raw_data_CSF_IDs$Tube_number)
## [1] 144
# how many unique IDs in the raw data CSF
length(unique(raw_data_CSF_IDs$Tube_number))
## [1] 144
lipidomics_CSF_all <- lipidomics_CSF_all %>%
mutate(Tube_number4 = str_sub(Tube_number,-4)) # last 4 digits
full_match_CSF = raw_data_CSF_IDs %>%
left_join(lipidomics_CSF_all,by = "Tube_number") %>%
mutate(match_type = ifelse(!is.na(Tube_number4.y),"Full",NA))
no_match_CSF = full_match_CSF %>%
filter(is.na(match_type)) %>%
dplyr::rename(Tube_number4 = Tube_number4.y) %>%
select(-names(lipidomics_CSF_all)) %>%
dplyr::rename(Tube_number4 = Tube_number4.x)
last4_match_CSF = no_match_CSF %>%
left_join(lipidomics_CSF_all %>%
filter(!Tube_number %in% (full_match_CSF %>% filter(!is.na(match_type)) %>%
pull(Tube_number))),
by = "Tube_number4") %>%
mutate(match_type = ifelse(!is.na(Tube_number),"Last4","No match"))
# final description of the samples available in the raw data without QC samples
combined_data_CSF = full_match_CSF %>%
filter(!is.na(match_type)) %>%
select(-c(Tube_number4.x,Tube_number4.y,match_type)) %>%
bind_rows(last4_match_CSF %>%
select(-Tube_number) %>%
left_join(raw_data_CSF_IDs %>% select(Tube_number,Tube_number4) %>%
filter(!Tube_number %in% (full_match_CSF %>% filter(!is.na(match_type)) %>%
pull(Tube_number)))) %>%
select(Tube_number,Patient,Position,Biofluid,Visit,Country,Rack,Box,Batch)) %>%
distinct()
paged_table(combined_data_CSF)
# how many IDs in the raw data CSF now
length(combined_data_CSF$Tube_number)
## [1] 144
# how many unique IDs in the raw data CSF now
length(unique(combined_data_CSF$Tube_number))
## [1] 144
# how many unique Patient IDs
length(unique(na.omit(combined_data_CSF$Patient)))
## [1] 112
# how many tube numbers and patient IDs per V0 and V1
combined_data_CSF %>%
group_by(Visit) %>%
summarise(
total_patients = n_distinct(Patient, na.rm = TRUE), # amount of patient IDs
total_tubes = n_distinct(Tube_number, na.rm = TRUE), # amount of tube numbers
total_patients_tubes_pairs = n_distinct(
paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)])) # amount of pairs
# which Tube numbers at V1 do not have a patient ID
combined_data_CSF %>%
filter(Visit == "V1") %>%
filter(is.na(Patient))
# patients at V0 and V1
patients_CSF_V0 <- combined_data_CSF %>%
filter(Visit == "V0") %>%
pull(Patient) %>%
unique()
patients_CSF_V1 <- combined_data_CSF %>%
filter(Visit == "V1") %>%
pull(Patient) %>%
unique() %>%
na.omit()
# check if all patients at V1 are at V0
all(patients_CSF_V1 %in% patients_CSF_V0)
## [1] FALSE
patients_CSF_V1[!patients_CSF_V1 %in% patients_CSF_V0] # patient TR319
## [1] "TR319"
# check patient in the info_patient dataset
lipidomics_CSF_all[lipidomics_CSF_all$Patient %in% patients_CSF_V1[!patients_CSF_V1 %in% patients_CSF_V0],]
raw_data_plasma_IDs = raw_data_plasma_final %>%
select(Tube_number,Batch)
raw_data_plasma_IDs <- raw_data_plasma_IDs %>%
mutate(Tube_number4 = str_sub(Tube_number,-4)) %>% # last 4 digits
filter(!str_detect(Tube_number, "QC")) # remove 66 QC IDs
# how many IDs in the raw data plasma
length(raw_data_plasma_IDs$Tube_number)
## [1] 316
# how many unique IDs in the raw data plasma
length(unique(raw_data_plasma_IDs$Tube_number))
## [1] 316
lipidomics_plasma_all <- lipidomics_plasma_all %>%
mutate(Tube_number4 = str_sub(Tube_number,-4)) # last 4 digits
full_match_plasma = raw_data_plasma_IDs %>%
left_join(lipidomics_plasma_all,by = "Tube_number") %>%
mutate(match_type = ifelse(!is.na(Tube_number4.y),"Full",NA))
no_match_plasma = full_match_plasma %>%
filter(is.na(match_type)) %>%
dplyr::rename(Tube_number4 = Tube_number4.y) %>%
select(-names(lipidomics_plasma_all)) %>%
dplyr::rename(Tube_number4 = Tube_number4.x)
last4_match_plasma = no_match_plasma %>%
left_join(lipidomics_plasma_all,by = "Tube_number4") %>%
mutate(match_type = ifelse(!is.na(Tube_number),"Last4","No match"))
# final description of the samples available in the raw data without QC samples
combined_data_plasma = full_match_plasma %>%
filter(!is.na(match_type)) %>%
select(-c(Tube_number4.x,Tube_number4.y,match_type)) %>%
bind_rows(last4_match_plasma %>%
select(-Tube_number) %>%
left_join(raw_data_plasma_IDs %>% select(Tube_number,Tube_number4)) %>%
select(Tube_number,Patient,Position,Biofluid,Visit,Country,Rack,Box,Batch)) %>%
distinct()
paged_table(combined_data_plasma)
# how many IDs in the raw data plasma now
length(combined_data_plasma$Tube_number)
## [1] 316
# how many unique IDs in the raw data plasma now
length(unique(combined_data_plasma$Tube_number))
## [1] 316
# how many unique Patient IDs
length(unique(na.omit(combined_data_plasma$Patient)))
## [1] 200
# how many tube numbers and patient IDs per V0 and V1
combined_data_plasma %>%
group_by(Visit) %>%
summarise(
total_patients = n_distinct(Patient, na.rm = TRUE), # amount of patient IDs
total_tubes = n_distinct(Tube_number, na.rm = TRUE), # amount of tube numbers
total_patients_tubes_pairs = n_distinct(
paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)])) # amount of pairs
# which Tube numbers at V0 do not have a patient ID
combined_data_plasma %>%
filter(Visit == "V0") %>%
filter(is.na(Patient))
# which Tube numbers at V1 do not have a patient ID
combined_data_plasma %>%
filter(Visit == "V1") %>%
filter(is.na(Patient))
# patients at V0 and V1
patients_plasma_V0 <- combined_data_plasma %>%
filter(Visit == "V0") %>%
pull(Patient) %>%
unique()
patients_plasma_V1 <- combined_data_plasma %>%
filter(Visit == "V1") %>%
pull(Patient) %>%
unique() %>%
na.omit()
# check if all patients at V1 are at V0
all(patients_plasma_V1 %in% patients_plasma_V0)
## [1] FALSE
patients_plasma_V1[!patients_plasma_V1 %in% patients_plasma_V0] # patients TR207, TR116 and TR123
## [1] "TR207" "TR116" "TR123"
# check patient in the info_patient dataset
lipidomics_plasma_all[lipidomics_plasma_all$Patient %in% patients_plasma_V1[!patients_plasma_V1 %in% patients_plasma_V0],]
The fourth step is to give an overview of how many samples of eALS, PGMC, CTR and mimic we have per fluid, visit and country.
# IDs in CSF (131)
IDs_CSF <- read_delim("data input/export-2025-11-07-PREMODIALS-AKDTR_BRNO_CHUFR_HMCIL_HRO_KSSGCH_MRI_NIUSASSK/CSFAcquisition.csv",
delim = ";", escape_double = FALSE, trim_ws = TRUE) %>%
filter(CsfSexists == 1) %>%
group_by(PatientID) %>%
mutate(Visit = paste0("V", row_number() - 1)) %>%
ungroup() %>%
select(PatientID,Visit) %>%
distinct()
IDs_CSF_data_1 = IDs_CSF %>%
left_join(GeneralDocumentation %>% select(PatientID,ParticipantCode)) %>%
dplyr::rename(Patient = ParticipantCode) %>%
distinct() %>%
left_join(all_participants_IDs %>% dplyr::rename(Patient = ParticipantCode)) %>%
mutate(
Country = dplyr::case_when(
grepl("TR", Patient) ~ "Turkey",
grepl("CH", Patient) ~ "Switzerland",
grepl("DE", Patient) ~ "Germany",
grepl("SK", Patient) ~ "Slovakia",
grepl("FR", Patient) ~ "France",
grepl("IL", Patient) ~ "Israel",
TRUE ~ NA_character_
)) %>%
arrange(Patient)
paged_table(IDs_CSF_data_1)
overview_country_CSF_1 = IDs_CSF_data_1 %>%
group_by(Visit,Country) %>%
summarise(n_unique_patients = n_distinct(Patient),.groups = "drop") %>%
arrange(Visit,desc(n_unique_patients))
overview_country_CSF_1
overview_status_CSF_1 = IDs_CSF_data_1 %>%
group_by(type,Visit,Country) %>%
distinct() %>%
summarise(nr_samples = n_distinct(Patient)) %>%
select(type, Visit,Country,nr_samples) %>%
pivot_wider(
names_from = type,
values_from = nr_samples,
values_fill = 0) %>%
select(Visit,Country,PGMC,CTR,ALS,mimic,SYMP,C9orf72,SOD1,TARDBP,FUS,other,`NA`)
overview_status_CSF_1
IDs_CSF_data = combined_data_CSF %>%
select(Patient,Visit,Tube_number) %>%
distinct() %>%
left_join(all_participants_IDs %>% dplyr::rename(Patient = ParticipantCode)) %>%
mutate(
Country = dplyr::case_when(
grepl("TR", Patient) ~ "Turkey",
grepl("CH", Patient) ~ "Switzerland",
grepl("DE", Patient) ~ "Germany",
grepl("SK", Patient) ~ "Slovakia",
grepl("FR", Patient) ~ "France",
grepl("IL", Patient) ~ "Israel",
TRUE ~ NA_character_
)) %>%
arrange(Patient)
paged_table(IDs_CSF_data)
overview_country_CSF = IDs_CSF_data %>%
group_by(Visit,Country) %>%
summarise(n_unique_patients = n_distinct(Patient),.groups = "drop") %>%
arrange(Visit,desc(n_unique_patients))
overview_country_CSF
overview_status_CSF = IDs_CSF_data %>%
group_by(type,Visit,Country) %>%
distinct() %>%
summarise(nr_samples = n_distinct(Patient)) %>%
select(type, Visit,Country,nr_samples) %>%
pivot_wider(
names_from = type,
values_from = nr_samples,
values_fill = 0) %>%
select(Visit,Country,PGMC,CTR,ALS,mimic,SYMP,C9orf72,SOD1,TARDBP,FUS,other,N.A.,`NA`)
overview_status_CSF
# IDs in plasma (235)
IDs_plasma <- read_delim("data input/export-2025-11-07-PREMODIALS-AKDTR_BRNO_CHUFR_HMCIL_HRO_KSSGCH_MRI_NIUSASSK/BloodAcquisition.csv",
delim = ";", escape_double = FALSE, trim_ws = TRUE) %>%
filter(BAexists == 1) %>%
group_by(PatientID) %>%
mutate(Visit = paste0("V", row_number() - 1)) %>%
ungroup() %>%
select(PatientID,Visit) %>%
distinct()
IDs_plasma_data_1 = IDs_plasma %>%
left_join(GeneralDocumentation %>% select(PatientID,ParticipantCode)) %>%
dplyr::rename(Patient = ParticipantCode) %>%
distinct() %>%
left_join(all_participants_IDs %>% dplyr::rename(Patient = ParticipantCode)) %>%
mutate(
Country = dplyr::case_when(
grepl("TR", Patient) ~ "Turkey",
grepl("CH", Patient) ~ "Switzerland",
grepl("DE", Patient) ~ "Germany",
grepl("SK", Patient) ~ "Slovakia",
grepl("FR", Patient) ~ "France",
grepl("IL", Patient) ~ "Israel",
TRUE ~ NA_character_
)) %>%
arrange(Patient)
paged_table(IDs_plasma_data_1)
overview_country_plasma_1 = IDs_plasma_data_1 %>%
group_by(Visit,Country) %>%
summarise(n_unique_patients = n_distinct(Patient),.groups = "drop") %>%
arrange(Visit,desc(n_unique_patients))
overview_country_plasma_1
overview_status_plasma_1 = IDs_plasma_data_1 %>%
group_by(type,Visit,Country) %>%
distinct() %>%
summarise(nr_samples = n_distinct(Patient)) %>%
select(type, Visit,Country,nr_samples) %>%
pivot_wider(
names_from = type,
values_from = nr_samples,
values_fill = 0) %>%
select(Visit,Country,PGMC,CTR,ALS,mimic,SYMP,C9orf72,SOD1,TARDBP,FUS,other,`NA`)
overview_status_plasma_1
IDs_plasma_data = combined_data_plasma %>%
select(Patient,Visit,Tube_number) %>%
distinct() %>%
left_join(all_participants_IDs %>% dplyr::rename(Patient = ParticipantCode)) %>%
mutate(
Country = dplyr::case_when(
grepl("TR", Patient) ~ "Turkey",
grepl("CH", Patient) ~ "Switzerland",
grepl("DE", Patient) ~ "Germany",
grepl("SK", Patient) ~ "Slovakia",
grepl("FR", Patient) ~ "France",
grepl("IL", Patient) ~ "Israel",
TRUE ~ NA_character_
)) %>%
arrange(Patient)
paged_table(IDs_plasma_data)
overview_country_plasma = IDs_plasma_data %>%
group_by(Visit,Country) %>%
summarise(n_unique_patients = n_distinct(Patient),.groups = "drop") %>%
arrange(Visit,desc(n_unique_patients))
overview_country_plasma
overview_status_plasma = IDs_plasma_data %>%
group_by(type,Visit,Country) %>%
distinct() %>%
summarise(nr_samples = n_distinct(Patient)) %>%
select(type, Visit,Country,nr_samples) %>%
pivot_wider(
names_from = type,
values_from = nr_samples,
values_fill = 0) %>%
select(Visit,Country,PGMC,CTR,ALS,mimic,SYMP,C9orf72,SOD1,TARDBP,FUS,other,N.A.,`NA`)
overview_status_plasma
The fifth step is to remove the QC rows, save the experimental setup (info of patient ID, batch, sample ID, visit, etc.), structure the lipidomics dataset to numeric and create summarised experiments datasets.
# removal of QC rows
raw_data_CSF_use = raw_data_CSF_final %>%
filter(!str_detect(Tube_number, "QC"))
tube_batch_CSF = raw_data_CSF_use[,1:2] %>%
left_join(combined_data_CSF %>% select(Tube_number,Batch,Patient,Visit))
raw_data_CSF_use_mat = as.matrix(raw_data_CSF_use[,3:ncol(raw_data_CSF_use)])
rownames(raw_data_CSF_use_mat) = tube_batch_CSF %>% pull(Tube_number)
# matrix with raw lipidomics data (rows - lipids and column - tube ID)
lipidomics_CSF = as.data.frame(t(raw_data_CSF_use_mat))
dim(lipidomics_CSF)
## [1] 892 144
# make columns numeric
for(i in 1:ncol(lipidomics_CSF)){lipidomics_CSF[,i] = as.numeric(lipidomics_CSF[,i])}
#make a list for all dataframes
lipidomics_data_CSF = list(raw_data = lipidomics_CSF)
## summarized objects
tube_batch_CSF = tube_batch_CSF %>%
left_join(IDs_CSF_data) %>%
mutate(subtype = ifelse(type == "C9orf72","C9orf72",
ifelse(type == "SOD1","SOD1",
ifelse(type == "FUS","FUS",
ifelse(type == "other","other",
ifelse(type == "TARDBP","TARDBP",NA)))))) %>%
mutate(type = ifelse(type %in% c("C9orf72","SOD1","FUS","TARDBP","other"),
"PGMC",type)) %>%
arrange(subtype) %>%
distinct(Tube_number, Batch,Patient,PatientID,type, .keep_all = TRUE)
# Check which tube numbers do not have a patient ID or type assigned (majority uncertain PURE MOTOR SYMPTOM (PMS) / MND MIMIC / EARLY ALS (EALS))
tube_number_CSF_missing = tube_batch_CSF %>%
filter(is.na(Patient) | is.na(type)) %>%
arrange(Patient)
tube_number_CSF_missing
GeneralDocumentation %>%
select(ParticipantCode,PatientID,PGMC,ALSuncertainty,ALSFUdiagnosis) %>%
filter(ParticipantCode %in% tube_number_CSF_missing$Patient) %>%
filter(!is.na(ParticipantCode))
abundance.columns <- 1:ncol(lipidomics_data_CSF$raw_data) # get abundance column numbers
clin = data.frame(label = tube_batch_CSF$Tube_number,
condition = tube_batch_CSF$type,
sub_condition = tube_batch_CSF$subtype,
batch = tube_batch_CSF$Batch)
lipidomics_data_CSF$raw_data$name = lipidomics_data_CSF$raw_data$ID = rownames(lipidomics_data_CSF$raw_data)
experimental.design = clin
experimental.design <- experimental.design %>%
group_by(condition) %>%
mutate(replicate = row_number()) %>%
ungroup()
lipidomics_data_CSF$se <- make_se(lipidomics_data_CSF$raw_data, abundance.columns, experimental.design)
lipidomics_data_CSF$se
## class: SummarizedExperiment
## dim: 892 144
## metadata(0):
## assays(1): ''
## rownames(892): CE(18:1-d7) Cholesterol(d7) ... PS(38:4)_PS(18:0/20:4)
## PS(40:4)_PS(20:0/20:4)
## rowData names(2): ID name
## colnames(144): PGMC_1 PGMC_2 ... CTR_26 CTR_27
## colData names(6): label ID ... batch replicate
writexl::write_xlsx(lipidomics_data_CSF$raw_data,"results/lipidomics_CSF_raw_data.xlsx")
# removal of QC rows
raw_data_plasma_use = raw_data_plasma_final %>%
filter(!str_detect(Tube_number, "QC"))
tube_batch_plasma = raw_data_plasma_use[,1:2] %>%
left_join(combined_data_plasma %>% select(Tube_number,Batch,Patient,Visit))
raw_data_plasma_use_mat = as.matrix(raw_data_plasma_use[,3:ncol(raw_data_plasma_use)])
rownames(raw_data_plasma_use_mat) = tube_batch_plasma %>% pull(Tube_number)
# matrix with raw lipidomics data (rows - lipids and column - tube ID)
lipidomics_plasma = as.data.frame(t(raw_data_plasma_use_mat))
dim(lipidomics_plasma)
## [1] 892 316
# make columns numeric
for(i in 1:ncol(lipidomics_plasma)){lipidomics_plasma[,i] = as.numeric(lipidomics_plasma[,i])}
#make a list for all dataframes
lipidomics_data_plasma = list(raw_data = lipidomics_plasma)
## summarized objects
tube_batch_plasma = tube_batch_plasma %>%
left_join(IDs_plasma_data) %>%
mutate(subtype = ifelse(type == "C9orf72","C9orf72",
ifelse(type == "SOD1","SOD1",
ifelse(type == "FUS","FUS",
ifelse(type == "other","other",
ifelse(type == "TARDBP","TARDBP",NA)))))) %>%
mutate(type = ifelse(type %in% c("C9orf72","SOD1","FUS","TARDBP","other"),
"PGMC",type)) %>%
arrange(subtype) %>%
distinct(Tube_number, Batch,Patient,PatientID,type, .keep_all = TRUE)
# Check which tube numbers do not have a patient ID or type assigned (majority uncertain PURE MOTOR SYMPTOM (PMS) / MND MIMIC / EARLY ALS (EALS))
tube_number_plasma_missing = tube_batch_plasma %>%
filter(is.na(Patient) | is.na(type)) %>%
arrange(Patient)
tube_number_plasma_missing
GeneralDocumentation %>%
select(ParticipantCode,PatientID,PGMC,ALSuncertainty,ALSFUdiagnosis) %>%
filter(ParticipantCode %in% tube_number_plasma_missing$Patient) %>%
filter(!is.na(ParticipantCode))
abundance.columns <- 1:ncol(lipidomics_data_plasma$raw_data) # get abundance column numbers
clin = data.frame(label = tube_batch_plasma$Tube_number,
condition = tube_batch_plasma$type,
sub_condition = tube_batch_plasma$subtype,
batch = tube_batch_plasma$Batch)
lipidomics_data_plasma$raw_data$name = lipidomics_data_plasma$raw_data$ID = rownames(lipidomics_data_plasma$raw_data)
experimental.design = clin
experimental.design <- experimental.design %>%
group_by(condition) %>%
mutate(replicate = row_number()) %>%
ungroup()
lipidomics_data_plasma$se <- make_se(lipidomics_data_plasma$raw_data, abundance.columns, experimental.design)
lipidomics_data_plasma$se
## class: SummarizedExperiment
## dim: 892 316
## metadata(0):
## assays(1): ''
## rownames(892): CE(18:1-d7) Cholesterol(d7) ... PS(38:4)_PS(18:0/20:4)
## PS(40:4)_PS(20:0/20:4)
## rowData names(2): ID name
## colnames(316): PGMC_1 PGMC_2 ... mimic_32 ALS_91
## colData names(6): label ID ... batch replicate
writexl::write_xlsx(lipidomics_data_plasma$raw_data,"results/lipidomics_plasma_raw_data.xlsx")
The sixth step is to check for missing omics per patient and overall, filter lipids that are not quantified in less than 1/3 of the samples and normalise the data (using vsn).
# heatmap missing
vis_miss(lipidomics_data_CSF$raw_data,show_perc = TRUE,show_perc_col = TRUE,cluster = F)
ggsave("plots/missing_vis_miss_heatmap_CSF.png", width = 15, height = 10, units = "in")
# filter for lipids that are quantified in at least 2/3 of the samples
lipidomics_data_CSF$se_filtered = filter_proteins(lipidomics_data_CSF$se,"fraction",min = 0.66)
#dimensions of the data
dim(lipidomics_data_CSF$se)
## [1] 892 144
dim(lipidomics_data_CSF$se_filtered)
## [1] 186 144
# heatmap missing for filtered data
vis_miss(as.data.frame(assay(lipidomics_data_CSF$se_filtered)),show_perc = TRUE,show_perc_col = TRUE,cluster = F)
ggsave("plots/missing_vis_miss_heatmap_filtered_CSF.png", width = 12, height = 8, units = "in")
plot_frequency(lipidomics_data_CSF$se)
ggsave("plots/frequency_met_identification_raw_CSF.pdf", width = 11, height = 8, units = "in")
plot_frequency(lipidomics_data_CSF$se_filtered)
ggsave("plots/frequency_met_identification_filtered_CSF.pdf", width = 11, height = 8, units = "in")
# % missing per patient:
round(apply(X = as.data.frame(assay(lipidomics_data_CSF$se)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_CSF$se))) * 100 , 1)
## PGMC_1 PGMC_2 PGMC_3 PGMC_4 PGMC_5 PGMC_6 PGMC_7 PGMC_8
## 77.5 76.3 77.9 77.5 77.4 77.7 77.0 77.2
## PGMC_9 PGMC_10 PGMC_11 PGMC_12 PGMC_13 PGMC_14 PGMC_15 PGMC_16
## 79.0 79.8 76.9 78.6 76.7 78.6 77.8 77.7
## PGMC_17 PGMC_18 PGMC_19 PGMC_20 PGMC_21 PGMC_22 PGMC_23 PGMC_24
## 78.3 78.6 77.1 76.6 77.5 77.6 78.9 79.0
## PGMC_25 PGMC_26 PGMC_27 PGMC_28 PGMC_29 PGMC_30 CTR_1 mimic_1
## 78.6 78.0 77.5 77.2 78.4 77.8 76.5 73.0
## CTR_2 mimic_2 ALS_1 CTR_3 SYMP_1 ALS_2 ALS_3 CTR_4
## 77.1 75.8 73.8 73.3 78.3 75.9 76.5 76.9
## mimic_3 ALS_4 CTR_5 CTR_6 mimic_4 mimic_5 SYMP_2 ALS_5
## 76.1 76.2 75.9 77.2 77.0 74.8 78.0 76.3
## mimic_6 ALS_6 ALS_7 CTR_7 mimic_7 CTR_8 mimic_8 ALS_8
## 76.3 78.5 76.1 76.3 76.1 76.8 76.1 75.8
## ALS_9 CTR_9 ALS_10 mimic_9 ALS_11 ALS_12 ALS_13 mimic_10
## 76.6 76.9 75.7 72.5 74.6 73.9 73.5 76.9
## ALS_14 SYMP_3 ALS_15 CTR_10 mimic_11 CTR_11 ALS_16 CTR_12
## 78.3 76.2 78.4 77.7 78.4 78.6 75.0 72.9
## CTR_13 CTR_14 mimic_12 CTR_15 CTR_16 SYMP_4 mimic_13 ALS_17
## 78.4 77.1 77.6 77.6 78.0 77.8 75.4 77.9
## ALS_18 mimic_14 CTR_17 ALS_19 ALS_20 mimic_15 ALS_21 ALS_22
## 79.1 75.6 76.9 77.0 78.6 76.6 79.9 75.8
## CTR_18 mimic_16 ALS_23 ALS_24 SYMP_5 CTR_19 ALS_25 ALS_26
## 77.9 76.0 76.2 75.3 77.9 77.7 78.1 78.8
## ALS_27 ALS_28 SYMP_6 ALS_29 ALS_30 SYMP_7 ALS_31 ALS_32
## 78.5 79.3 80.0 78.0 79.0 78.1 77.5 78.0
## ALS_33 CTR_20 SYMP_8 mimic_17 SYMP_9 CTR_21 CTR_22 ALS_34
## 75.2 79.0 77.9 77.5 76.5 77.2 80.4 81.1
## ALS_35 ALS_36 ALS_37 CTR_23 ALS_38 ALS_39 ALS_40 mimic_18
## 76.6 74.4 79.3 79.1 77.1 78.7 77.9 78.8
## ALS_41 N.A._1 SYMP_10 CTR_24 ALS_42 ALS_43 ALS_44 ALS_45
## 77.5 79.4 78.1 79.0 77.8 76.0 76.3 77.7
## CTR_25 mimic_19 mimic_20 NA._1 NA._2 ALS_46 ALS_47 ALS_48
## 78.8 79.8 77.9 59.3 78.3 78.4 78.0 76.6
## ALS_49 N.A._2 mimic_21 ALS_50 mimic_22 ALS_51 CTR_26 CTR_27
## 74.9 81.6 80.2 77.4 76.9 77.0 78.3 78.5
round(apply(X = as.data.frame(assay(lipidomics_data_CSF$se_filtered)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_CSF$se_filtered))) * 100 , 1)
## PGMC_1 PGMC_2 PGMC_3 PGMC_4 PGMC_5 PGMC_6 PGMC_7 PGMC_8
## 1.6 2.2 5.4 1.1 2.2 2.2 0.5 2.7
## PGMC_9 PGMC_10 PGMC_11 PGMC_12 PGMC_13 PGMC_14 PGMC_15 PGMC_16
## 6.5 6.5 7.0 3.2 4.8 3.8 3.8 3.8
## PGMC_17 PGMC_18 PGMC_19 PGMC_20 PGMC_21 PGMC_22 PGMC_23 PGMC_24
## 3.2 4.8 2.2 0.5 2.2 2.7 7.0 3.8
## PGMC_25 PGMC_26 PGMC_27 PGMC_28 PGMC_29 PGMC_30 CTR_1 mimic_1
## 3.2 3.8 3.8 4.8 3.2 2.7 2.2 0.0
## CTR_2 mimic_2 ALS_1 CTR_3 SYMP_1 ALS_2 ALS_3 CTR_4
## 5.4 0.5 0.5 0.0 3.2 1.1 1.1 1.1
## mimic_3 ALS_4 CTR_5 CTR_6 mimic_4 mimic_5 SYMP_2 ALS_5
## 1.6 2.7 1.1 2.7 1.6 2.2 3.8 1.1
## mimic_6 ALS_6 ALS_7 CTR_7 mimic_7 CTR_8 mimic_8 ALS_8
## 2.2 4.8 1.6 1.1 1.1 2.2 1.1 0.5
## ALS_9 CTR_9 ALS_10 mimic_9 ALS_11 ALS_12 ALS_13 mimic_10
## 1.1 1.6 1.1 2.2 0.5 0.5 0.5 1.6
## ALS_14 SYMP_3 ALS_15 CTR_10 mimic_11 CTR_11 ALS_16 CTR_12
## 6.5 2.2 3.2 1.6 3.8 2.7 0.5 2.7
## CTR_13 CTR_14 mimic_12 CTR_15 CTR_16 SYMP_4 mimic_13 ALS_17
## 2.7 1.6 3.8 2.2 2.7 3.8 1.1 2.2
## ALS_18 mimic_14 CTR_17 ALS_19 ALS_20 mimic_15 ALS_21 ALS_22
## 6.5 1.1 1.1 1.1 4.3 2.2 8.1 1.1
## CTR_18 mimic_16 ALS_23 ALS_24 SYMP_5 CTR_19 ALS_25 ALS_26
## 2.2 1.6 1.6 1.1 2.2 2.2 3.2 4.3
## ALS_27 ALS_28 SYMP_6 ALS_29 ALS_30 SYMP_7 ALS_31 ALS_32
## 3.2 5.9 9.1 5.4 6.5 4.3 4.8 2.7
## ALS_33 CTR_20 SYMP_8 mimic_17 SYMP_9 CTR_21 CTR_22 ALS_34
## 3.2 5.4 4.3 3.2 2.2 2.7 11.3 14.0
## ALS_35 ALS_36 ALS_37 CTR_23 ALS_38 ALS_39 ALS_40 mimic_18
## 2.7 1.1 8.1 6.5 1.6 7.0 3.2 5.4
## ALS_41 N.A._1 SYMP_10 CTR_24 ALS_42 ALS_43 ALS_44 ALS_45
## 1.6 6.5 3.2 7.5 4.3 2.7 0.5 2.2
## CTR_25 mimic_19 mimic_20 NA._1 NA._2 ALS_46 ALS_47 ALS_48
## 7.0 9.7 3.8 0.5 3.8 3.8 3.8 1.6
## ALS_49 N.A._2 mimic_21 ALS_50 mimic_22 ALS_51 CTR_26 CTR_27
## 0.5 17.7 8.6 3.2 1.6 3.2 4.8 5.4
#normalization
lipidomics_data_CSF$se_filt_norm <- normalize_vsn(lipidomics_data_CSF$se_filtered)
DEP::meanSdPlot(lipidomics_data_CSF$se_filt_norm)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the vsn package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_CSF$se_filtered)),"results/data_filtered_CSF.xlsx")
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_CSF$se_filt_norm)),"results/data_filtered_normalized_CSF.xlsx")
# imputation
lipidomics_data_CSF$se_filt_impute <- impute(
lipidomics_data_CSF$se_filt_norm,fun = "MinProb",q = 0.01)
## Imputing along margin 2 (samples/columns).
## [1] 0.4905946
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_CSF$se_filt_impute)),"results/data_filtered_normalized_imputed_CSF.xlsx")
# heatmap missing
vis_miss(lipidomics_data_plasma$raw_data,show_perc = TRUE,show_perc_col = TRUE,cluster = F)
ggsave("plots/missing_vis_miss_heatmap_plasma.png", width = 15, height = 10, units = "in")
# filter for lipids that are quantified in at least 2/3 of the samples
lipidomics_data_plasma$se_filtered = filter_proteins(lipidomics_data_plasma$se,"fraction",min = 0.66)
#dimensions of the data
dim(lipidomics_data_plasma$se)
## [1] 892 316
dim(lipidomics_data_plasma$se_filtered)
## [1] 684 316
# heatmap missing for filtered data
vis_miss(as.data.frame(assay(lipidomics_data_plasma$se_filtered)),show_perc = TRUE,show_perc_col = TRUE,cluster = F)
ggsave("plots/missing_vis_miss_heatmap_filtered_plasma.png", width = 12, height = 8, units = "in")
plot_frequency(lipidomics_data_plasma$se)
ggsave("plots/frequency_met_identification_raw_plasma.pdf", width = 11, height = 8, units = "in")
plot_frequency(lipidomics_data_plasma$se_filtered)
ggsave("plots/frequency_met_identification_filtered_plasma.pdf", width = 11, height = 8, units = "in")
# % missing per patient:
round(apply(X = as.data.frame(assay(lipidomics_data_plasma$se)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_plasma$se))) * 100 , 1)
## PGMC_1 PGMC_2 PGMC_3 PGMC_4 PGMC_5 PGMC_6 PGMC_7 PGMC_8
## 16.6 18.7 19.7 19.1 19.5 18.8 20.0 20.7
## PGMC_9 PGMC_10 PGMC_11 PGMC_12 PGMC_13 PGMC_14 PGMC_15 PGMC_16
## 22.0 22.6 23.1 23.7 22.3 21.1 21.3 22.2
## PGMC_17 PGMC_18 PGMC_19 PGMC_20 PGMC_21 PGMC_22 PGMC_23 PGMC_24
## 22.4 22.3 27.9 29.1 24.3 27.5 28.4 24.4
## PGMC_25 PGMC_26 PGMC_27 PGMC_28 PGMC_29 PGMC_30 PGMC_31 PGMC_32
## 26.9 26.8 24.8 23.1 22.1 22.6 22.1 16.8
## PGMC_33 PGMC_34 PGMC_35 PGMC_36 PGMC_37 PGMC_38 PGMC_39 PGMC_40
## 16.7 16.7 18.6 20.2 19.3 17.8 18.0 18.5
## PGMC_41 PGMC_42 PGMC_43 PGMC_44 PGMC_45 PGMC_46 PGMC_47 PGMC_48
## 20.4 20.0 16.6 18.2 18.5 19.1 17.9 18.5
## PGMC_49 PGMC_50 PGMC_51 PGMC_52 PGMC_53 PGMC_54 PGMC_55 PGMC_56
## 17.9 19.8 23.8 25.0 20.0 20.1 21.5 21.3
## PGMC_57 PGMC_58 PGMC_59 PGMC_60 PGMC_61 PGMC_62 PGMC_63 PGMC_64
## 22.4 23.0 27.5 17.4 15.5 17.9 18.2 16.4
## PGMC_65 PGMC_66 PGMC_67 PGMC_68 PGMC_69 PGMC_70 PGMC_71 PGMC_72
## 16.8 17.5 18.6 16.7 18.0 23.3 23.0 16.1
## PGMC_73 PGMC_74 PGMC_75 PGMC_76 PGMC_77 PGMC_78 PGMC_79 PGMC_80
## 19.4 19.1 20.1 18.7 23.1 18.3 21.6 20.6
## PGMC_81 PGMC_82 PGMC_83 PGMC_84 PGMC_85 ALS_1 CTR_1 CTR_2
## 19.1 19.7 22.9 28.1 25.4 18.2 17.9 17.6
## ALS_2 CTR_3 ALS_3 mimic_1 CTR_4 ALS_4 ALS_5 ALS_6
## 16.0 17.8 17.9 20.9 15.7 16.8 15.8 17.2
## SYMP_1 mimic_2 ALS_7 CTR_5 CTR_6 mimic_3 ALS_8 ALS_9
## 16.4 15.7 14.1 16.1 15.5 18.3 18.4 18.2
## CTR_7 CTR_8 CTR_9 SYMP_2 CTR_10 SYMP_3 CTR_11 ALS_10
## 21.7 16.6 16.8 17.2 18.8 16.7 18.2 16.4
## ALS_11 ALS_12 CTR_12 SYMP_4 mimic_4 ALS_13 CTR_13 ALS_14
## 16.5 14.8 17.4 14.8 17.4 15.4 16.0 15.4
## ALS_15 CTR_14 mimic_5 mimic_6 mimic_7 ALS_16 CTR_15 CTR_16
## 16.8 17.9 18.3 20.0 19.2 21.9 18.0 17.2
## mimic_8 ALS_17 CTR_17 ALS_18 CTR_18 mimic_9 CTR_19 CTR_20
## 17.0 18.4 17.6 17.2 19.8 19.3 17.5 18.5
## mimic_10 CTR_21 CTR_22 mimic_11 ALS_19 CTR_23 mimic_12 ALS_20
## 17.5 17.8 20.1 18.5 16.8 19.2 20.1 17.7
## ALS_21 ALS_22 ALS_23 ALS_24 CTR_24 CTR_25 ALS_25 CTR_26
## 23.8 16.1 19.5 17.3 21.0 19.7 21.1 19.3
## CTR_27 ALS_26 ALS_27 CTR_28 CTR_29 CTR_30 CTR_31 CTR_32
## 20.0 18.9 17.2 16.7 17.3 18.9 19.6 19.3
## CTR_33 ALS_28 ALS_29 mimic_13 ALS_30 CTR_34 ALS_31 CTR_35
## 19.1 19.4 18.3 17.3 19.1 21.1 18.2 20.3
## ALS_32 ALS_33 CTR_36 CTR_37 ALS_34 ALS_35 ALS_36 SYMP_5
## 18.3 18.7 17.4 20.5 16.9 19.4 17.9 19.7
## CTR_38 ALS_37 CTR_39 CTR_40 ALS_38 mimic_14 ALS_39 mimic_15
## 19.1 16.1 19.1 18.7 20.9 17.9 20.0 20.4
## CTR_41 CTR_42 CTR_43 ALS_40 ALS_41 CTR_44 SYMP_6 ALS_42
## 21.1 22.1 20.1 22.4 20.1 21.5 20.9 20.5
## ALS_43 ALS_44 SYMP_7 SYMP_8 CTR_45 ALS_45 CTR_46 mimic_16
## 22.5 23.4 23.3 20.4 20.2 20.6 23.5 20.1
## mimic_17 CTR_47 CTR_48 ALS_46 CTR_49 CTR_50 mimic_18 ALS_47
## 20.0 21.6 21.0 21.6 21.3 20.6 21.3 20.2
## CTR_51 CTR_52 mimic_19 CTR_53 CTR_54 CTR_55 CTR_56 ALS_48
## 21.3 21.5 20.9 21.1 21.6 22.6 21.3 21.9
## ALS_49 ALS_50 CTR_57 CTR_58 CTR_59 CTR_60 ALS_51 CTR_61
## 25.6 24.0 22.8 21.6 23.1 23.2 20.6 24.3
## ALS_52 CTR_62 ALS_53 CTR_63 ALS_54 ALS_55 ALS_56 CTR_64
## 20.9 21.4 23.4 22.1 24.3 22.3 25.1 23.2
## ALS_57 mimic_20 ALS_58 SYMP_9 ALS_59 SYMP_10 ALS_60 ALS_61
## 22.1 21.7 22.9 22.8 22.3 24.3 22.1 23.1
## ALS_62 ALS_63 ALS_64 ALS_65 mimic_21 mimic_22 mimic_23 CTR_65
## 22.4 21.4 23.7 22.0 22.8 24.7 25.2 23.4
## CTR_66 mimic_24 mimic_25 CTR_67 ALS_66 CTR_68 N.A._1 ALS_67
## 22.2 23.7 22.9 24.2 22.2 25.4 21.4 23.0
## SYMP_11 CTR_69 ALS_68 CTR_70 mimic_26 SYMP_12 ALS_69 mimic_27
## 23.3 21.7 23.3 23.3 21.6 22.2 26.3 22.4
## ALS_70 SYMP_13 CTR_71 CTR_72 NA._1 ALS_71 mimic_28 NA._2
## 23.2 22.8 23.9 23.2 23.2 25.2 21.6 22.6
## ALS_72 CTR_73 CTR_74 CTR_75 ALS_73 CTR_76 NA._3 CTR_77
## 22.9 20.1 23.1 23.9 26.0 22.3 23.3 23.5
## ALS_74 ALS_75 N.A._2 mimic_29 SYMP_14 CTR_78 CTR_79 ALS_76
## 23.3 23.4 22.2 23.4 28.9 27.6 27.7 31.1
## CTR_80 NA._4 SYMP_15 ALS_77 ALS_78 PGMC_86 ALS_79 mimic_30
## 29.3 25.0 25.1 27.0 27.0 25.7 28.9 24.4
## CTR_81 ALS_80 ALS_81 ALS_82 ALS_83 ALS_84 SYMP_16 ALS_85
## 25.8 27.5 25.8 25.9 28.1 25.7 26.3 25.4
## CTR_82 ALS_86 ALS_87 CTR_83 SYMP_17 ALS_88 ALS_89 SYMP_18
## 23.0 24.2 25.3 26.7 23.0 25.7 24.7 25.8
## mimic_31 ALS_90 mimic_32 ALS_91
## 26.9 27.1 27.6 23.2
round(apply(X = as.data.frame(assay(lipidomics_data_plasma$se_filtered)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_plasma$se_filtered))) * 100 , 1)
## PGMC_1 PGMC_2 PGMC_3 PGMC_4 PGMC_5 PGMC_6 PGMC_7 PGMC_8
## 1.6 2.5 4.8 3.9 2.3 1.8 0.9 1.2
## PGMC_9 PGMC_10 PGMC_11 PGMC_12 PGMC_13 PGMC_14 PGMC_15 PGMC_16
## 1.6 2.6 2.9 3.4 1.8 0.7 1.2 1.3
## PGMC_17 PGMC_18 PGMC_19 PGMC_20 PGMC_21 PGMC_22 PGMC_23 PGMC_24
## 1.5 1.8 6.6 7.9 3.7 6.3 7.6 2.9
## PGMC_25 PGMC_26 PGMC_27 PGMC_28 PGMC_29 PGMC_30 PGMC_31 PGMC_32
## 5.8 5.7 3.9 2.8 1.3 1.6 1.6 1.3
## PGMC_33 PGMC_34 PGMC_35 PGMC_36 PGMC_37 PGMC_38 PGMC_39 PGMC_40
## 1.3 0.7 1.3 2.6 2.3 1.5 2.0 1.3
## PGMC_41 PGMC_42 PGMC_43 PGMC_44 PGMC_45 PGMC_46 PGMC_47 PGMC_48
## 3.4 2.8 0.6 1.2 0.9 2.0 1.3 0.9
## PGMC_49 PGMC_50 PGMC_51 PGMC_52 PGMC_53 PGMC_54 PGMC_55 PGMC_56
## 1.3 1.6 2.9 5.8 1.2 1.0 2.2 1.5
## PGMC_57 PGMC_58 PGMC_59 PGMC_60 PGMC_61 PGMC_62 PGMC_63 PGMC_64
## 1.8 3.5 6.1 1.8 0.4 2.8 2.2 0.6
## PGMC_65 PGMC_66 PGMC_67 PGMC_68 PGMC_69 PGMC_70 PGMC_71 PGMC_72
## 2.0 1.2 1.3 0.4 0.9 1.9 2.3 0.4
## PGMC_73 PGMC_74 PGMC_75 PGMC_76 PGMC_77 PGMC_78 PGMC_79 PGMC_80
## 2.9 1.0 2.8 0.9 5.4 1.3 1.2 1.0
## PGMC_81 PGMC_82 PGMC_83 PGMC_84 PGMC_85 ALS_1 CTR_1 CTR_2
## 0.4 1.0 1.8 7.0 4.2 2.8 0.9 1.8
## ALS_2 CTR_3 ALS_3 mimic_1 CTR_4 ALS_4 ALS_5 ALS_6
## 0.4 0.9 0.9 4.2 1.0 0.6 0.7 1.9
## SYMP_1 mimic_2 ALS_7 CTR_5 CTR_6 mimic_3 ALS_8 ALS_9
## 1.0 0.6 0.9 1.3 0.7 2.3 1.9 1.9
## CTR_7 CTR_8 CTR_9 SYMP_2 CTR_10 SYMP_3 CTR_11 ALS_10
## 5.8 0.3 0.9 0.9 1.3 1.0 1.5 0.9
## ALS_11 ALS_12 CTR_12 SYMP_4 mimic_4 ALS_13 CTR_13 ALS_14
## 0.6 0.1 1.0 0.4 1.8 0.7 0.1 0.4
## ALS_15 CTR_14 mimic_5 mimic_6 mimic_7 ALS_16 CTR_15 CTR_16
## 0.7 2.0 0.3 2.9 1.9 4.1 0.6 1.0
## mimic_8 ALS_17 CTR_17 ALS_18 CTR_18 mimic_9 CTR_19 CTR_20
## 0.9 1.6 1.0 1.3 1.9 2.0 2.3 1.6
## mimic_10 CTR_21 CTR_22 mimic_11 ALS_19 CTR_23 mimic_12 ALS_20
## 0.9 1.5 2.2 0.9 0.3 1.5 1.3 0.6
## ALS_21 ALS_22 ALS_23 ALS_24 CTR_24 CTR_25 ALS_25 CTR_26
## 6.3 0.6 1.3 0.3 2.8 1.9 3.9 1.5
## CTR_27 ALS_26 ALS_27 CTR_28 CTR_29 CTR_30 CTR_31 CTR_32
## 1.9 1.0 0.6 0.4 0.4 2.2 2.0 1.8
## CTR_33 ALS_28 ALS_29 mimic_13 ALS_30 CTR_34 ALS_31 CTR_35
## 2.2 1.5 1.9 0.4 2.5 2.6 1.0 3.5
## ALS_32 ALS_33 CTR_36 CTR_37 ALS_34 ALS_35 ALS_36 SYMP_5
## 0.7 1.0 0.4 4.1 0.9 2.6 1.6 1.5
## CTR_38 ALS_37 CTR_39 CTR_40 ALS_38 mimic_14 ALS_39 mimic_15
## 1.6 0.1 1.0 2.0 2.9 0.9 1.2 1.3
## CTR_41 CTR_42 CTR_43 ALS_40 ALS_41 CTR_44 SYMP_6 ALS_42
## 1.2 2.0 0.7 1.5 1.2 2.2 0.9 1.2
## ALS_43 ALS_44 SYMP_7 SYMP_8 CTR_45 ALS_45 CTR_46 mimic_16
## 2.8 3.8 2.9 0.6 1.0 2.0 3.9 0.6
## mimic_17 CTR_47 CTR_48 ALS_46 CTR_49 CTR_50 mimic_18 ALS_47
## 0.9 1.2 0.9 2.2 1.9 0.6 0.9 1.2
## CTR_51 CTR_52 mimic_19 CTR_53 CTR_54 CTR_55 CTR_56 ALS_48
## 0.7 1.0 0.9 0.6 1.2 2.5 0.9 1.2
## ALS_49 ALS_50 CTR_57 CTR_58 CTR_59 CTR_60 ALS_51 CTR_61
## 5.4 3.5 1.8 1.6 1.0 2.5 0.6 4.1
## ALS_52 CTR_62 ALS_53 CTR_63 ALS_54 ALS_55 ALS_56 CTR_64
## 0.6 1.3 4.5 1.9 3.1 1.3 3.9 2.3
## ALS_57 mimic_20 ALS_58 SYMP_9 ALS_59 SYMP_10 ALS_60 ALS_61
## 1.5 1.6 2.3 1.9 1.9 2.9 1.6 2.0
## ALS_62 ALS_63 ALS_64 ALS_65 mimic_21 mimic_22 mimic_23 CTR_65
## 1.8 0.7 2.0 1.3 1.9 4.5 3.9 2.2
## CTR_66 mimic_24 mimic_25 CTR_67 ALS_66 CTR_68 N.A._1 ALS_67
## 1.5 2.6 2.0 2.9 2.2 3.9 1.2 1.9
## SYMP_11 CTR_69 ALS_68 CTR_70 mimic_26 SYMP_12 ALS_69 mimic_27
## 2.2 1.2 3.1 2.6 1.2 1.8 6.0 1.5
## ALS_70 SYMP_13 CTR_71 CTR_72 NA._1 ALS_71 mimic_28 NA._2
## 1.9 2.2 3.1 2.2 2.6 4.5 1.2 1.8
## ALS_72 CTR_73 CTR_74 CTR_75 ALS_73 CTR_76 NA._3 CTR_77
## 1.6 0.9 2.8 3.5 6.1 2.2 2.0 2.8
## ALS_74 ALS_75 N.A._2 mimic_29 SYMP_14 CTR_78 CTR_79 ALS_76
## 3.4 2.6 1.5 2.5 8.0 6.7 6.4 10.8
## CTR_80 NA._4 SYMP_15 ALS_77 ALS_78 PGMC_86 ALS_79 mimic_30
## 9.1 4.2 4.2 6.6 5.7 4.8 8.8 3.2
## CTR_81 ALS_80 ALS_81 ALS_82 ALS_83 ALS_84 SYMP_16 ALS_85
## 4.5 7.0 4.7 4.7 7.9 4.7 4.7 4.5
## CTR_82 ALS_86 ALS_87 CTR_83 SYMP_17 ALS_88 ALS_89 SYMP_18
## 2.2 3.4 4.4 5.4 2.5 4.7 3.7 4.2
## mimic_31 ALS_90 mimic_32 ALS_91
## 6.3 6.7 6.4 2.6
#normalization
lipidomics_data_plasma$se_filt_norm <- normalize_vsn(lipidomics_data_plasma$se_filtered)
DEP::meanSdPlot(lipidomics_data_plasma$se_filt_norm)
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_plasma$se_filtered)),"results/data_filtered_plasma.xlsx")
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_plasma$se_filt_norm)),"results/data_filtered_normalized_plasma.xlsx")
# imputation
lipidomics_data_plasma$se_filt_impute <- impute(
lipidomics_data_plasma$se_filt_norm,fun = "MinProb",q = 0.01)
## Imputing along margin 2 (samples/columns).
## [1] 0.6924497
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_plasma$se_filt_impute)),"results/data_filtered_normalized_imputed_plasma.xlsx")
The seventh step is to create visualizations, such as heatmaps and PCAs (for each fluid with ID/batch info, and for both fluids together)
## Distribution of expression values per ID and tube number
mean_expression_plot(data = t(assay(lipidomics_data_CSF$se)),
file_sample = "plots/boxplots_expression_each_sample_CSF.pdf",
file_mass = "plots/boxplots_expression_each_lipid_CSF.pdf",
title = "CSF lipidomics")
## Heatmaps
# CSF
title = "lipidomics CSF"
data = assay(lipidomics_data_CSF$se)
#row annotation
names_lipids = rownames(data)
Lipids_final <- read_excel("data input/RE_ metabolomics/Lipids_final.xlsx")
lipids = Lipids_final %>%
select(Name,Category) %>%
filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
lipid_type = list(mycolors)[[1]])
#without grouping, all lipids
p = make_pheatmap(data = data,
cluster_cols = F,
main = paste0("Heatmap all lipids\n",title, "\n not clustered"),
show_rownames = F,
labels_col = colnames(data))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_raw_",title,".pdf"))
## quartz_off_screen
## 2
#without grouping, filtered lipids
data = assay(lipidomics_data_CSF$se_filt_norm)
#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>%
select(Name,Category) %>%
filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
lipid_type = list(mycolors)[[1]])
p = make_pheatmap(data = data,
cluster_cols = F,
main = paste0("Heatmap filtered lipids\n",title, "\n not clustered"),
show_rownames = F,
labels_col = colnames(data))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_filtered_",title,".pdf"))
## quartz_off_screen
## 2
#without grouping, filtered and imputed
data = assay(lipidomics_data_CSF$se_filt_impute)
#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>%
select(Name,Category) %>%
filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
lipid_type = list(mycolors)[[1]])
p = make_pheatmap(data = data,
cluster_cols = F,
main = paste0("Heatmap imputed lipids\n",title, "\n not clustered"),
show_rownames = F,
labels_col = colnames(data))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_imputed_",title,".pdf"))
## quartz_off_screen
## 2
# without grouping, 100 most variable lipids
d = assay(lipidomics_data_CSF$se)
d2 = head(order(rowVars(d),decreasing = T),100)
data = d[d2,]
#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>%
select(Name,Category) %>%
filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
lipid_type = list(mycolors)[[1]])
p = make_pheatmap(data = d[d2,],
cluster_cols = F,
main = paste0("Heatmap 100 most variable lipids\n",title, "\nnot clustered"),
labels_col = colnames(d))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_raw_mostvar_",title,".pdf"))
## quartz_off_screen
## 2
## UMAP (with participant status type)
d_CSF = t(assay(lipidomics_data_CSF$se_filt_impute))
labels_group = tube_batch_CSF$type
group = c("mediumpurple1", "darksalmon", "yellow4","blue4","seagreen4","orange3")
UMAP_density_plot(data = d_CSF,
ggtitle = paste0("UMAP with type labels\n", title),
legend_name = "Fluid labels",
labels = labels_group,
file_location = paste0("plots/UMAP_type_",title,".pdf"),
file_location_labels =paste0("plots/UMAP_type_labels_",title,".pdf"),
colour_set = group)
# UMAP (with batch info)
labels_group = tube_batch_CSF$Batch
group = c("#3DB7E4", "#FF8849", "#69BE28")
UMAP_density_plot(data = d_CSF,
ggtitle = paste0("UMAP with batch labels\n", title),
legend_name = "Fluid labels",
labels = labels_group,
file_location = paste0("plots/UMAP_batch_",title,".pdf"),
file_location_labels =paste0("plots/UMAP_batch_labels_",title,".pdf"),
colour_set = group)
## Distribution of expression values per ID and tube number
mean_expression_plot(data = t(assay(lipidomics_data_plasma$se)),
file_sample = "plots/boxplots_expression_each_sample_plasma.pdf",
file_mass = "plots/boxplots_expression_each_lipid_plasma.pdf",
title = "plasma lipidomics")
## Heatmaps
# plasma
title = "lipidomics plasma"
data = assay(lipidomics_data_plasma$se)
#row annotation
names_lipids = rownames(data)
Lipids_final <- read_excel("data input/RE_ metabolomics/Lipids_final.xlsx")
lipids = Lipids_final %>%
select(Name,Category) %>%
filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
lipid_type = list(mycolors)[[1]])
#without grouping, all lipids
p = make_pheatmap(data = data,
cluster_cols = F,
main = paste0("Heatmap all lipids\n",title, "\n not clustered"),
show_rownames = F,
labels_col = colnames(data))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_raw_",title,".pdf"))
## quartz_off_screen
## 2
#without grouping, filtered lipids
data = assay(lipidomics_data_plasma$se_filt_norm)
#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>%
select(Name,Category) %>%
filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
lipid_type = list(mycolors)[[1]])
p = make_pheatmap(data = data,
cluster_cols = F,
main = paste0("Heatmap filtered lipids\n",title, "\n not clustered"),
show_rownames = F,
labels_col = colnames(data))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_filtered_",title,".pdf"))
## quartz_off_screen
## 2
#without grouping, filtered and imputed
data = assay(lipidomics_data_plasma$se_filt_impute)
#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>%
select(Name,Category) %>%
filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
lipid_type = list(mycolors)[[1]])
p = make_pheatmap(data = data,
cluster_cols = F,
main = paste0("Heatmap imputed lipids\n",title, "\n not clustered"),
show_rownames = F,
labels_col = colnames(data))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_imputed_",title,".pdf"))
## quartz_off_screen
## 2
# without grouping, 100 most variable lipids
d = assay(lipidomics_data_plasma$se)
d2 = head(order(rowVars(d),decreasing = T),100)
data = d[d2,]
#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>%
select(Name,Category) %>%
filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
lipid_type = list(mycolors)[[1]])
p = make_pheatmap(data = d[d2,],
cluster_cols = F,
main = paste0("Heatmap 100 most variable lipids\n",title, "\nnot clustered"),
labels_col = colnames(d))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_raw_mostvar_",title,".pdf"))
## quartz_off_screen
## 2
## UMAP (with participant status type)
d_plasma = t(assay(lipidomics_data_plasma$se_filt_impute))
labels_group = tube_batch_plasma$type
group = c("mediumpurple1", "darksalmon", "yellow4","blue4","seagreen4","orange3")
UMAP_density_plot(data = d_plasma,
ggtitle = paste0("UMAP with type labels\n", title),
legend_name = "Fluid labels",
labels = labels_group,
file_location = paste0("plots/UMAP_type_",title,".pdf"),
file_location_labels =paste0("plots/UMAP_type_labels_",title,".pdf"),
colour_set = group)
# UMAP (with batch info)
labels_group = tube_batch_plasma$Batch
group = c("#3DB7E4", "#FF8849", "#69BE28","#757083","#CF7684","#A0C8C0","#CDAE3B")
UMAP_density_plot(data = d_plasma,
ggtitle = paste0("UMAP with batch labels\n", title),
legend_name = "Fluid labels",
labels = labels_group,
file_location = paste0("plots/UMAP_batch_",title,".pdf"),
file_location_labels =paste0("plots/UMAP_batch_labels_",title,".pdf"),
colour_set = group)
proteins_in_both_fluids = colnames(d_CSF)[colnames(d_CSF) %in% colnames(d_plasma)]
d_1 = d_CSF[,proteins_in_both_fluids]
d_2 = d_plasma[,proteins_in_both_fluids]
d = do.call("rbind",list(d_1,d_2))
labels_group = c(rep("CSF", 144), rep("Plasma", 316))
title = "plasma_vs_CSF"
d <- is.na(d)
dim(d)
## [1] 460 183
#perform plots with function
UMAP_density_plot(data = d,
ggtitle = paste0("UMAP with fluid labels\n", title),
legend_name = "Fluid labels",
labels = labels_group,
file_location = paste0("plots/UMAP_fluid_group_lipids_",title,".pdf"),
file_location_labels=paste0("plots/UMAP_fluid_group_labels_",title,".pdf"),
colour_set = group)
The eighth step is to perform differential expression analyses, with particular interest of eALS vs CTR, PGMC vs CTR. For this, the information between IDs and the status (eALS, CTR, PGMC, mimic) have to be collected from other datasets